library(tm)
library(stringr)
library(plyr)


# Prerequisites -- run these scripts before this one:
#   "02_13_m5s_forum_import_tables.R"
#   "02_14_m5s_forum_data_cleaning.R"

# File name of the names-by-gender CSV (read further down in this script)
italian_names_by_gender <- "italian_names_by_gender.csv"


#### author_alpha ####

# Build a readable name from each author_alpha url: drop the first 46
# characters of the url, then turn every "-" in the remainder into a space.
name_alpha <- gsub("-", " ", substring(as.character(author_alpha$url), 47))

# Optional diagnostics (uncomment to run):
# max(sapply(name_alpha, countToken))   # max number of words in one value
# sum(name_alpha == '')                 # number of empty values
# table(grepl("\\d+", name_alpha))      # how many values contain digits

# Keep the processed name next to its source row
author_alpha$name_processed <- name_alpha

#write.table(author_alpha, "~/Desktop/author_alpha.csv", sep="\t")

####    ####


#### author_beta ####



# Names of the comment authors, passed through sanitizeNegativeString: a
# negative value in the first character of the string creates problems with
# igraph.
name_beta <- sapply(author_beta$name, sanitizeNegativeString, USE.NAMES = FALSE)

# Disabled alternative: unconditionally replace the name with authorUrl
# whenever authorUrl is an actual url.
# i <- 1; N <- length(author_beta$authorUrl)
# while (i <= N) {
#  if (grepl("http",author_beta$authorUrl[i])) {
#    name_beta[i] <- author_beta$authorUrl[i]
#  }
#  i <- i + 1
# }
# rm(N);rm(i);

# Optional diagnostic: how many values contain digits
# table(grepl("\\d+", name_beta))

# Foreign key for every author_beta name, computed by crossTableNameMatch
# from the processed author_alpha names and the author_alpha urls
# (FUN.VALUE enforces exactly one string per name).
# WARNING: COMPUTATIONALLY INTENSIVE
new_name_beta_foreign_key <- vapply(
  name_beta,
  crossTableNameMatch,
  FUN.VALUE = character(1),
  name_alpha,
  author_alpha$url,
  USE.NAMES = FALSE
)

# Fall back to the author's own authorUrl as key when all of these hold:
#   - the computed key is not already a url,
#   - authorUrl is an actual url,
#   - the author's name is not duplicated in author_beta.
for (row_idx in seq_along(author_beta$authorUrl)) {
  if (grepl("http", new_name_beta_foreign_key[row_idx])) {
    next
  }
  if (!grepl("http", author_beta$authorUrl[row_idx])) {
    next
  }
  if (appearsMoreThanOnce(author_beta$name[row_idx], author_beta$name)) {
    next
  }
  new_name_beta_foreign_key[row_idx] <- author_beta$authorUrl[row_idx]
}
rm(row_idx)

# Store the key on the source table
author_beta$new_foreign_key <- new_name_beta_foreign_key

# Re-key the comments: replace each comment's authorUrl with the new foreign
# key of the author_beta row it points to.
# The lookup is done once and only rows that actually found an author are
# overwritten; comments without a match keep their current authorUrl instead
# of being turned into NA.
# NOTE(review): if comment$authorUrl is a factor, keys that are not existing
# levels would become NA -- confirm it is character at this point.
author_row <- match(comment$authorUrl, author_beta$authorUrl)
has_author <- !is.na(author_row)
comment$authorUrl[has_author] <-
  new_name_beta_foreign_key[author_row[has_author]]
rm(author_row, has_author)

# Remove from author_beta authors with "http://www.beppegrillo.it/listeciviche/author/"
# in their key. A logical mask is used instead of -which(...): with no match,
# -which(...) is an empty index and would drop every row. The prefix is
# matched literally (fixed = TRUE) so that "." is not a regex wildcard.
pattern <- "http://www.beppegrillo.it/listeciviche/author/"
has_member_key <- grepl(pattern, author_beta$new_foreign_key, fixed = TRUE)
author_beta <- author_beta[!has_member_key, ]
rm(pattern, has_member_key)

# Remove columns from author_beta_reduced
# drops <- c("authorUrl","isAnonymous", "timestamp")
# author_beta_reduced <- author_beta_reduced[,!(names(author_beta_reduced) %in% drops)]
# rm(drops)

# Remove authors with the same id, keeping the first occurrence of each key
author_beta <- author_beta[!duplicated(author_beta$new_foreign_key), ]


####    ####


#### author_omega ####

# Append table author alpha and beta to unique author table

# author_alpha part: drop authorId and the raw name, expose the processed
# name as "name" and the url as "id", and flag the rows as members.
# drop = FALSE keeps a data frame even if a single column were left.
drops <- c("authorId", "name")
temp_df_1 <- author_alpha[, !(names(author_alpha) %in% drops), drop = FALSE]
temp_df_1 <- rename(temp_df_1, c("name_processed" = "name", "url" = "id"))
temp_df_1$member <- TRUE

# author_beta part: drop authorUrl and isAnonymous, expose the new foreign
# key as "id", and flag the rows as non-members.
drops <- c("authorUrl", "isAnonymous")
temp_df_2 <- author_beta[, !(names(author_beta) %in% drops), drop = FALSE]
temp_df_2 <- rename(temp_df_2, c("new_foreign_key" = "id"))
temp_df_2$member <- FALSE

# Stack both parts (rbind requires the two frames to share column names)
author_omega <- rbind(temp_df_1, temp_df_2)
rm(drops, temp_df_1, temp_df_2)

# Check integrity (if all comments are still linked to author)
# WARNING: COMPUTATIONALLY INTENSIVE
# checkIntegrity <- sapply(comment$authorUrl, match, author_omega$id)
# sum(is.na(checkIntegrity))
# rm(checkIntegrity)

rm(name_alpha, name_beta)

# Create dataframe of name list (no header row, every column as character)
name_with_gender <- read.csv(
  italian_names_by_gender,
  header = FALSE,
  colClasses = "character",
  strip.white = TRUE
)
rm(italian_names_by_gender)

# Get a vector with gender attribute; names without a result are "unknown"
author_omega$gender <- genderAttribution(author_omega$name, name_with_gender)
author_omega$gender[is.na(author_omega$gender)] <- "unknown"


# Remove rows with an empty name (subset() also drops rows whose name is NA)
author_omega <- subset(author_omega, name != "")

# Add column for 2012 primary election
# (candidates are read from names_candidates_mps_2013.csv under data_path)
all_candidates_m5s <- read.table(
  paste0(data_path, "names_candidates_mps_2013.csv"),
  header = TRUE,
  quote = "\"",
  strip.white = TRUE
)

# Process names (order firstname and surname + tolower)
all_candidates_m5s$n_processed <-
  sapply(all_candidates_m5s$NOME, name_first, name_with_gender)

# Prepare string: lower-case and strip one trailing space so that names can
# be compared with the candidate / elected lists
author_omega$name <- tolower(author_omega$name)
author_omega$name <- gsub(" $", "", author_omega$name, perl = TRUE)

# Create the flag column explicitly (NA = not flagged) before the masked
# assignment, instead of relying on subassignment into a missing column.
if (!("candidated13" %in% names(author_omega))) {
  author_omega$candidated13 <- rep(NA, nrow(author_omega))
}
author_omega$candidated13[
  author_omega$name %in% all_candidates_m5s$n_processed
] <- TRUE
rm(all_candidates_m5s)

# Add column for 2013 general election (only the "Movimento 5 Stelle" list)
elected_mps <- read.csv(paste0(data_path, "names_elected_mps_2013.csv"))
elected_mps <- subset(elected_mps, Lista == "Movimento 5 Stelle")

# Prepare string (same normalisation as for author_omega$name)
elected_mps$Eletto <- tolower(elected_mps$Eletto)
elected_mps$Eletto <- gsub(" $", "", elected_mps$Eletto, perl = TRUE)

# Same explicit initialisation as for candidated13
if (!("elected13" %in% names(author_omega))) {
  author_omega$elected13 <- rep(NA, nrow(author_omega))
}
author_omega$elected13[author_omega$name %in% elected_mps$Eletto] <- TRUE

# Manual corrections for candidated and elected
author_omega$candidated13[author_omega$name == "marino mastrangeli"] <- TRUE
author_omega$elected13[author_omega$name == "marino mastrangeli"] <- TRUE


rm(elected_mps)
rm(name_with_gender)

# Show table (include NA values)
# table(author_omega$gender, exclude=NULL)

# Show table (exclude NA values) (percentage)
# table(author_omega$gender)/length(author_omega$gender)

# Plot pie chart with gender distribution (exclude NA values)
# pie(table(author_omega$gender))

# Optional plain-text export of the three tables
# write.table(author_omega, "author.csv", sep=",") 
# write.table(comment, "comment.csv", sep=",") 
# write.table(thread, "thread.csv", sep=",")

# Persist each table to its own .RData file
save(list = "author_omega", file = "02_15_m5s_forum_apr15_author_omega.RData")
save(list = "comment", file = "02_15_m5s_forum_apr15_comment.RData")
save(list = "thread", file = "02_15_m5s_forum_apr15_thread.RData")
